Clustering

library(tidyverse)

Load data

dir.short <- "data/individual_book_train"
# Only the per-stock CSV files are read; any other file in the directory is
# ignored instead of being passed to read.csv().
all.files.short <- list.files(dir.short, pattern = "\\.csv$")

# Maximum number of time_ids sampled from each stock file.
# NOTE: the sample is random; call set.seed() before this chunk if the run
# needs to be reproducible.
n_time_ids <- 10

# Per-file summaries are stored in a preallocated list and combined once at
# the end, rather than growing a data frame with rbind() on every iteration.
per_stock <- vector("list", length(all.files.short))
names(per_stock) <- all.files.short

for (i in all.files.short) {
  stock <- read.csv(file.path(dir.short, i))

  # Randomly select time_ids from this stock. Sampling by index avoids the
  # sample(x) surprise when x is a single number, and the size is capped so a
  # file with fewer than n_time_ids distinct ids does not raise an error.
  ids <- unique(stock$time_id)
  time_ids <- ids[sample.int(length(ids), min(n_time_ids, length(ids)))]

  # "stock_12.csv" -> "12"; anchored patterns so only the prefix and the
  # extension are stripped.
  file_name <- sub("\\.csv$", "", sub("^stock_", "", i))

  # Row-level features from the first book level, then their mean per time_id.
  per_stock[[i]] <- stock |>
    filter(time_id %in% time_ids) |>
    mutate(
      WAP = (bid_price1 * ask_size1 + ask_price1 * bid_size1) /
        (bid_size1 + ask_size1),
      BidAskSpread = ask_price1 / bid_price1 - 1,
      imbalance = (bid_size1 - ask_size1) / (bid_size1 + ask_size1)
    ) |>
    group_by(time_id) |>
    summarise(mean_BAS = mean(BidAskSpread),
              mean_WAP = mean(WAP),
              imbalance = mean(imbalance)) |>
    mutate(file_name = file_name)

  # Progress indicator: one line per processed file.
  print(file_name)

}

# Columns: time_id, mean_BAS, mean_WAP, imbalance, file_name.
raw_df_cl <- bind_rows(per_stock)
[1] "0"
[1] "1"
[1] "10"
[1] "100"
[1] "101"
[1] "102"
[1] "103"
[1] "104"
[1] "105"
[1] "107"
[1] "108"
[1] "109"
[1] "11"
[1] "110"
[1] "111"
[1] "112"
[1] "113"
[1] "114"
[1] "115"
[1] "116"
[1] "118"
[1] "119"
[1] "120"
[1] "122"
[1] "123"
[1] "124"
[1] "125"
[1] "126"
[1] "13"
[1] "14"
[1] "15"
[1] "16"
[1] "17"
[1] "18"
[1] "19"
[1] "2"
[1] "20"
[1] "21"
[1] "22"
[1] "23"
[1] "26"
[1] "27"
[1] "28"
[1] "29"
[1] "3"
[1] "30"
[1] "31"
[1] "32"
[1] "33"
[1] "34"
[1] "35"
[1] "36"
[1] "37"
[1] "38"
[1] "39"
[1] "4"
[1] "40"
[1] "41"
[1] "42"
[1] "43"
[1] "44"
[1] "46"
[1] "47"
[1] "48"
[1] "5"
[1] "50"
[1] "51"
[1] "52"
[1] "53"
[1] "55"
[1] "56"
[1] "58"
[1] "59"
[1] "6"
[1] "60"
[1] "61"
[1] "62"
[1] "63"
[1] "64"
[1] "66"
[1] "67"
[1] "68"
[1] "69"
[1] "7"
[1] "70"
[1] "72"
[1] "73"
[1] "74"
[1] "75"
[1] "76"
[1] "77"
[1] "78"
[1] "8"
[1] "80"
[1] "81"
[1] "82"
[1] "83"
[1] "84"
[1] "85"
[1] "86"
[1] "87"
[1] "88"
[1] "89"
[1] "9"
[1] "90"
[1] "93"
[1] "94"
[1] "95"
[1] "96"
[1] "97"
[1] "98"
[1] "99"

Apply clustering

library(caret)

# Build the clustering table from the raw per-stock summaries:
#   1. min-max rescale each numeric feature to [0, 1],
#   2. order rows by numeric stock id,
#   3. label each row as "<stock id> <time_id>",
#   4. drop stock 31 and keep only the label plus the two clustering features.
# NOTE(review): the min/max used for rescaling are taken over all rows,
# including stock 31, which is removed afterwards - confirm this is intended.
df_cl <- raw_df_cl |>
  mutate(across(c(mean_BAS, mean_WAP, imbalance),
                \(x) (x - min(x)) / (max(x) - min(x)))) |>
  mutate(file_name = as.numeric(file_name)) |>
  arrange(file_name) |>
  mutate(name = paste(file_name, time_id, sep = " ")) |>
  filter(file_name != 31) |>
  select(name, mean_BAS, imbalance)

Find optimal k - scree plot

# Candidate numbers of cluster centers to evaluate.
cluster_counts <- 1:15

# Total within-cluster sum of squares for each candidate, fitted on every
# column of df_cl except the first (the row label).
wss <- vapply(
  cluster_counts,
  function(n_centers) {
    kmeans(df_cl[-1], centers = n_centers, nstart = 20)$tot.withinss
  },
  numeric(1)
)

# Elbow plot: within-cluster sum of squares against the number of clusters.
plot(cluster_counts, wss,
     type = "b",
     ylab = "Within groups sum of squares",
     xlab = "Number of Clusters")

# Number of clusters chosen from the elbow of the plot above.
k <- 4

Cluster with k = 4

# Final k-means fit on the feature columns (everything except the row label).
km.out <- kmeans(df_cl[-1], centers = k, nstart = 20)

# Plotting table: row label, the two features, and the assigned cluster as a
# factor so ggplot uses a discrete colour scale.
df = with(df_cl, data.frame(
    names = name,
    mean_BAS = mean_BAS,
    imbalance = imbalance,
    cluster = factor(km.out$cluster)
))

# Scatter of the two features coloured by cluster; each point is annotated
# with its label, which geom_text() inherits from the top-level mapping.
plot = ggplot(
  df,
  aes(x = mean_BAS, y = imbalance, color = cluster, label = names)
) +
  geom_point() +
  geom_text(vjust = -1, hjust = 1) +
  labs(title = "Cluster Plot", x = "mean_BAS", y = "imbalance") +
  theme_minimal()

# Convert the static ggplot into an interactive plotly widget.
library(plotly)
ggplotly(plot)